<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE topic
  PUBLIC "-//OASIS//DTD DITA Composite//EN" "ditabase.dtd">
<topic id="topic1" xml:lang="en">
  <title id="ii20941">gpmapreduce.yaml</title>
  <body>
    <p>gpmapreduce configuration file.</p>
    <section id="section2">
      <title>Synopsis</title>
      <codeblock>%YAML 1.1
---
<xref href="#topic1/VERSION" format="dita">VERSION</xref>: 1.0.0.2
<xref href="#topic1/DATABASE" format="dita">DATABASE</xref>: dbname
<xref href="#topic1/USER" format="dita">USER</xref>: db_username
<xref href="#topic1/HOST" format="dita">HOST</xref>: master_hostname
<xref href="#topic1/PORT" format="dita">PORT</xref>: master_port</codeblock>
      <codeblock>  - <xref href="#topic1/DEFINE" format="dita">DEFINE</xref>: 
  - <xref href="#topic1/INPUT" format="dita">INPUT</xref>:
     <xref href="#topic1/NAME" format="dita">NAME</xref>: input_name
     <xref href="#topic1/FILE" format="dita">FILE</xref>: 
       - <i>hostname</i>: /path/to/file
     <xref href="#topic1/GPFDIST" format="dita">GPFDIST</xref>:
       - <i>hostname</i>:port/file_pattern
     <xref href="#topic1/TABLE" format="dita">TABLE</xref>: table_name
     <xref href="#topic1/QUERY" format="dita">QUERY</xref>: SELECT_statement
     <xref href="#topic1/EXEC" format="dita">EXEC</xref>: command_string
     <xref href="#topic1/COLUMNS" format="dita">COLUMNS</xref>:
       - field_name data_type
     <xref href="#topic1/FORMAT" format="dita">FORMAT</xref>: TEXT | CSV
     <xref href="#topic1/DELIMITER" format="dita">DELIMITER</xref>: delimiter_character
     <xref href="#topic1/ESCAPE" format="dita">ESCAPE</xref>: escape_character
     <xref href="#topic1/NULL" format="dita">NULL</xref>: null_string
     <xref href="#topic1/QUOTE" format="dita">QUOTE</xref>: csv_quote_character
     <xref href="#topic1/ERROR_LIMIT" format="dita">ERROR_LIMIT</xref>: integer
     <xref href="#topic1/ENCODING" format="dita">ENCODING</xref>: database_encoding</codeblock>
      <codeblock>  - <xref href="#topic1/OUTPUT" format="dita">OUTPUT</xref>:
     <xref href="#topic1/OUTPUTNAME" format="dita">NAME</xref>: output_name
     <xref href="#topic1/OUTPUTFILE" format="dita">FILE</xref>: file_path_on_client
     <xref href="#topic1/OUTPUTTABLE" format="dita">TABLE</xref>: table_name
     <xref href="#topic1/KEYS" format="dita">KEYS</xref>:
       - column_name
     <xref href="#topic1/MODE" format="dita">MODE</xref>: REPLACE | APPEND</codeblock>
      <codeblock>  - <xref href="#topic1/MAP" format="dita">MAP</xref>:
     <xref href="#topic1/TCFNAME" format="dita">NAME</xref>: function_name
     <xref href="#topic1/FUNCTION" format="dita">FUNCTION</xref>: function_definition
     <xref href="#topic1/LANGUAGE" format="dita">LANGUAGE</xref>: perl | python | c
     <xref href="#topic1/LIBRARY" format="dita">LIBRARY</xref>: /path/filename.so
     <xref href="#topic1/PARAMETERS" format="dita">PARAMETERS</xref>: 
       - name type
     <xref href="#topic1/RETURNS" format="dita">RETURNS</xref>: 
       - name type
     <xref href="#topic1/OPTIMIZE" format="dita">OPTIMIZE</xref>: STRICT IMMUTABLE
     <xref href="#topic1/TCFMODE" format="dita">MODE</xref>: SINGLE | MULTI</codeblock>
      <codeblock>  - <xref href="#topic1/TCF" format="dita">TRANSITION | CONSOLIDATE | FINALIZE</xref>:
     <xref href="#topic1/TCFNAME" format="dita">NAME</xref>: function_name
     <xref href="#topic1/FUNCTION" format="dita">FUNCTION</xref>: function_definition
     <xref href="#topic1/LANGUAGE" format="dita">LANGUAGE</xref>: perl | python | c
     <xref href="#topic1/LIBRARY" format="dita">LIBRARY</xref>: /path/filename.so
     <xref href="#topic1/PARAMETERS" format="dita">PARAMETERS</xref>: 
       - name type
     <xref href="#topic1/RETURNS" format="dita">RETURNS</xref>: 
       - name type
     <xref href="#topic1/OPTIMIZE" format="dita">OPTIMIZE</xref>: STRICT IMMUTABLE
     <xref href="#topic1/TCFMODE" format="dita">MODE</xref>: SINGLE | MULTI</codeblock>
      <codeblock>  - <xref href="#topic1/REDUCE" format="dita">REDUCE</xref>:
     <xref href="#topic1/REDUCENAME" format="dita">NAME</xref>: reduce_job_name
     <xref href="#topic1/TRANSITION" format="dita">TRANSITION</xref>: transition_function_name
     <xref href="#topic1/CONSOLIDATE" format="dita">CONSOLIDATE</xref>: consolidate_function_name
     <xref href="#topic1/FINALIZE" format="dita">FINALIZE</xref>: finalize_function_name
     <xref href="#topic1/INITIALIZE" format="dita">INITIALIZE</xref>: value
     <xref href="#topic1/REDUCEKEYS" format="dita">KEYS</xref>:
       - key_name</codeblock>
      <codeblock>  - <xref href="#topic1/TASK" format="dita">TASK</xref>:
     <xref href="#topic1/TASKNAME" format="dita">NAME</xref>: task_name
     <xref href="#topic1/SOURCE" format="dita">SOURCE</xref>: input_name
     <xref href="#topic1/TASKMAP" format="dita">MAP</xref>: map_function_name
     <xref href="#topic1/REDUCE" format="dita">REDUCE</xref>: reduce_function_name
<xref href="#topic1/EXECUTE" format="dita">EXECUTE</xref>:</codeblock>
      <codeblock>  - <xref href="#topic1/RUN" format="dita">RUN</xref>:
     <xref href="#topic1/EXECUTESOURCE" format="dita">SOURCE</xref>: input_or_task_name
     <xref href="#topic1/TARGET" format="dita">TARGET</xref>: output_name
     <xref href="#topic1/EXECUTEMAP" format="dita">MAP</xref>: map_function_name
     <xref href="#topic1/EXECUTEREDUCE" format="dita">REDUCE</xref>: reduce_function_name...</codeblock>
    </section>
    <section id="section4">
      <title>Description</title>
      <p>You specify the input, map and reduce tasks, and the output for the
        Greenplum MapReduce <codeph>gpmapreduce</codeph> program in a
        YAML-formatted configuration file. (This reference page uses the name
        <codeph>gpmapreduce.yaml</codeph> when referring to this file; you may
        choose your own name for the file.)</p>
      <p>The <codeph>gpmapreduce</codeph> utility processes the YAML configuration
        file in order, using indentation (spaces) to determine the document hierarchy
        and the relationships between the sections. The use of white space in the
        file is significant.</p>
    </section>
    <section id="section5">
      <title>Keys and Values</title>
      <parml>
        <plentry>
          <pt id="VERSION">VERSION</pt>
          <pd>
            <p>Required. The version of the Greenplum MapReduce YAML specification. Currently
              supported versions are 1.0.0.1, 1.0.0.2, and 1.0.0.3.</p>
          </pd>
        </plentry>
        <plentry>
          <pt id="DATABASE">DATABASE</pt>
          <pd>Optional. Specifies which database in Greenplum to connect to. If not specified,
            defaults to the default database or <codeph>$PGDATABASE</codeph> if set.</pd>
        </plentry>
        <plentry>
          <pt id="USER">USER</pt>
          <pd>Optional. Specifies which database role to use to connect. If not specified, defaults
            to the current user or <codeph>$PGUSER</codeph> if set. You must be a Greenplum
            superuser to run functions written in untrusted Python and Perl. Regular database users
            can run functions written in trusted Perl. You also must be a database superuser to run
            MapReduce jobs that contain <xref href="#topic1/FILE" format="dita">FILE</xref>, <xref
              href="#topic1/GPFDIST" format="dita">GPFDIST</xref> and <xref href="#topic1/EXEC"
              format="dita">EXEC</xref> input types.</pd>
        </plentry>
        <plentry>
          <pt id="HOST">HOST</pt>
          <pd>Optional. Specifies Greenplum master host name. If not specified, defaults to
            localhost or <codeph>$PGHOST</codeph> if set.</pd>
        </plentry>
        <plentry>
          <pt id="PORT">PORT</pt>
          <pd>Optional. Specifies Greenplum master port. If not specified, defaults to 5432 or
              <codeph>$PGPORT</codeph> if set.</pd>
        </plentry>
        <plentry>
          <pt id="DEFINE">DEFINE</pt>
          <pd>Required. A sequence of definitions for this MapReduce document. The
              <codeph>DEFINE</codeph> section must have at least one <codeph>INPUT</codeph>
            definition. <parml>
              <plentry>
                <pt id="INPUT">INPUT</pt>
                <pd>Required. Defines the input data. Every MapReduce document must have at least
                  one input defined. Multiple input definitions are allowed in a document, but each
                  input definition can specify only one of these access types: a file, a
                    <codeph>gpfdist</codeph> file reference, a table in the database, an SQL
                  command, or an operating system command. See the key descriptions that
                  follow for information about each access type.<parml>
                    <plentry>
                      <pt id="NAME">NAME</pt>
                      <pd>
                        <p>A name for this input. Names must be unique with regard to the names of
                          other objects in this MapReduce job (such as map function, task, reduce
                          function and output names). Also, names cannot conflict with existing
                          objects in the database (such as tables, functions or views).</p>
                      </pd>
                    </plentry>
                    <plentry>
                      <pt id="FILE">FILE</pt>
                      <pd>A sequence of one or more input files in the format:
                          <codeph>seghostname:/path/to/filename</codeph>. You must be a Greenplum
                        Database superuser to run MapReduce jobs with <codeph>FILE</codeph> input.
                        The file must reside on a Greenplum segment host.</pd>
                    </plentry>
                  </parml><parml>
                    <plentry>
                      <pt id="GPFDIST">GPFDIST</pt>
                      <pd>A sequence identifying one or more running <codeph>gpfdist</codeph> file
                        servers in the format: <codeph>hostname[:port]/file_pattern</codeph>. You
                        must be a Greenplum Database superuser to run MapReduce jobs with
                          <codeph>GPFDIST</codeph> input.</pd>
                    </plentry>
                    <plentry>
                      <pt id="TABLE">TABLE</pt>
                      <pd>The name of an existing table in the database.</pd>
                    </plentry>
                    <plentry>
                      <pt id="QUERY">QUERY</pt>
                      <pd>A SQL <codeph>SELECT</codeph> command to run within the database.</pd>
                    </plentry>
                    <plentry>
                      <pt id="EXEC">EXEC</pt>
                      <pd>An operating system command to run on the Greenplum segment hosts. The
                        command is run by all segment instances in the system by default. For
                        example, if you have four segment instances per segment host, the command
                        will be run four times on each host. You must be a Greenplum Database
                        superuser to run MapReduce jobs with <codeph>EXEC</codeph> input.</pd>
                    </plentry>
                    <plentry>
                      <pt id="COLUMNS">COLUMNS</pt>
                      <pd>Optional. Columns are specified as: <codeph>column_name
                          </codeph><codeph>[</codeph><codeph>data_type</codeph><codeph>]</codeph>.
                        If not specified, the default is <codeph>value text</codeph>. The <xref
                          href="#topic1/DELIMITER" format="dita">DELIMITER</xref> character is what
                        separates two data value fields (columns). A row is determined by a line
                        feed character (<codeph>0x0a</codeph>).</pd>
                    </plentry>
                    <plentry>
                      <pt id="FORMAT">FORMAT</pt>
                      <pd>
                        <p>Optional. Specifies the format of the data - either delimited text
                            (<codeph>TEXT</codeph>) or comma separated values (<codeph>CSV</codeph>)
                          format. If the data format is not specified, defaults to
                            <codeph>TEXT</codeph>.</p>
                      </pd>
                    </plentry>
                    <plentry>
                      <pt id="DELIMITER">DELIMITER</pt>
                      <pd>
                        <p>Optional for <xref href="#topic1/FILE" format="dita">FILE</xref>, <xref
                            href="#topic1/GPFDIST" format="dita">GPFDIST</xref> and <xref
                            href="#topic1/EXEC" format="dita">EXEC</xref> inputs. Specifies a single
                          character that separates data values. The default is a tab character in
                            <codeph>TEXT</codeph> mode, a comma in <codeph>CSV</codeph> mode. The
                          delimiter character must only appear between any two data value fields. Do
                          not place a delimiter at the beginning or end of a row.</p>
                      </pd>
                    </plentry>
                    <plentry>
                      <pt id="ESCAPE">ESCAPE</pt>
                      <pd>Optional for <xref href="#topic1/FILE" format="dita">FILE</xref>, <xref
                          href="#topic1/GPFDIST" format="dita">GPFDIST</xref> and <xref
                          href="#topic1/EXEC" format="dita">EXEC</xref> inputs. Specifies the single
                        character that is used for C escape sequences (such as
                          <codeph>\n</codeph>, <codeph>\t</codeph>, <codeph>\100</codeph>, and so on)
                        and for escaping data characters that might otherwise be taken as row or
                        column delimiters. Make sure to choose an escape character that is not used
                        anywhere in your actual column data. The default escape character is a \
                        (backslash) for text-formatted files and a <codeph>"</codeph> (double quote)
                        for CSV-formatted files; however, it is possible to specify another character
                        to represent an escape. It is also possible to disable escaping by
                        specifying the value <codeph>'OFF'</codeph> as the escape value. This is
                        very useful for data such as text-formatted web log data that has many
                        embedded backslashes that are not intended to be escapes.</pd>
                    </plentry>
                    <plentry>
                      <pt id="NULL">NULL</pt>
                      <pd>Optional for <xref href="#topic1/FILE" format="dita">FILE</xref>, <xref
                          href="#topic1/GPFDIST" format="dita">GPFDIST</xref> and <xref
                          href="#topic1/EXEC" format="dita">EXEC</xref> inputs. Specifies the string
                        that represents a null value. The default is <codeph>\N</codeph> in
                          <codeph>TEXT</codeph> format, and an empty value with no quotations in
                          <codeph>CSV</codeph> format. You might prefer an empty string even in
                          <codeph>TEXT</codeph> mode for cases where you do not want to distinguish
                        nulls from empty strings. Any input data item that matches this string will
                        be considered a null value.</pd>
                    </plentry>
                    <plentry>
                      <pt id="QUOTE">QUOTE</pt>
                      <pd>Optional for <xref href="#topic1/FILE" format="dita">FILE</xref>, <xref
                          href="#topic1/GPFDIST" format="dita">GPFDIST</xref> and <xref
                          href="#topic1/EXEC" format="dita">EXEC</xref> inputs. Specifies the
                        quotation character for <codeph>CSV</codeph> formatted files. The default is
                        a double quote (<codeph>"</codeph>). In <codeph>CSV</codeph> formatted
                        files, data value fields must be enclosed in double quotes if they contain
                        any commas or embedded new lines. Fields that contain double quote
                        characters must be surrounded by double quotes, and the embedded double
                        quotes must each be represented by a pair of consecutive double quotes. It
                        is important to always open and close quotes correctly in order for data
                        rows to be parsed correctly.</pd>
                    </plentry>
                    <plentry>
                      <pt id="ERROR_LIMIT">ERROR_LIMIT</pt>
                      <pd>If the input rows have format errors they will be discarded provided that
                        the error limit count is not reached on any Greenplum segment instance
                        during input processing. If the error limit is not reached, all good rows
                        will be processed and any error rows discarded.</pd>
                    </plentry>
                  </parml><parml>
                    <plentry>
                      <pt id="ENCODING">ENCODING</pt>
                      <pd>Character set encoding to use for the data. Specify a string constant
                        (such as <codeph>'SQL_ASCII'</codeph>), an integer encoding number, or
                          <codeph>DEFAULT</codeph> to use the default client encoding. See <xref
                          href="../../ref_guide/character_sets.xml#topic1" type="topic"
                          format="dita"/> for more information.</pd>
                    </plentry>
                  </parml></pd>
              </plentry>
              <plentry>
                <pt id="OUTPUT">OUTPUT</pt>
                <pd>Optional. Defines where to output the formatted data of this MapReduce job. If
                  output is not defined, the default is <codeph>STDOUT</codeph> (standard output of
                  the client). You can send output to a file on the client host or to an existing
                  table in the database.<parml>
                    <plentry>
                      <pt id="OUTPUTNAME">NAME</pt>
                      <pd>A name for this output. The default output name is
                        <codeph>STDOUT</codeph>. Names must be unique with regard to the names of
                        other objects in this MapReduce job (such as map function, task, reduce
                        function and input names). Also, names cannot conflict with existing objects
                        in the database (such as tables, functions or views).</pd>
                    </plentry>
                    <plentry>
                      <pt id="OUTPUTFILE">FILE</pt>
                      <pd>Specifies a file location on the MapReduce client machine to output data
                        in the format: <codeph>/path/to/filename</codeph>.</pd>
                    </plentry>
                    <plentry>
                      <pt id="OUTPUTTABLE">TABLE</pt>
                      <pd>Specifies the name of a table in the database to output data. If this
                        table does not exist prior to running the MapReduce job, it will be created
                        using the distribution policy specified with <xref href="#topic1/KEYS"
                          format="dita">KEYS</xref>.</pd>
                    </plentry>
                    <plentry>
                      <pt id="KEYS">KEYS</pt>
                      <pd>Optional for <xref href="#topic1/OUTPUTTABLE" format="dita">TABLE</xref>
                        output. Specifies the column(s) to use as the Greenplum Database
                        distribution key. If the <xref href="#topic1/EXECUTE" format="dita"
                          >EXECUTE</xref> task contains a <xref href="#topic1/REDUCE" type="topic"
                          format="dita">REDUCE</xref> definition, then the <codeph>REDUCE</codeph>
                        keys will be used as the table distribution key by default. Otherwise, the
                        first column of the table will be used as the distribution key.</pd>
                    </plentry>
                    <plentry>
                      <pt id="MODE">MODE</pt>
                      <pd>Optional for <xref href="#topic1/OUTPUTTABLE" format="dita">TABLE</xref>
                        output. If not specified, the default is to create the table if it does not
                        already exist, but error out if it does exist. Declaring
                          <codeph>APPEND</codeph> adds output data to an existing table (provided
                        the table schema matches the output format) without removing any existing
                        data. Declaring <codeph>REPLACE</codeph> will drop the table if it exists
                        and then recreate it. Both <codeph>APPEND</codeph> and
                          <codeph>REPLACE</codeph> will create a new table if one does not
                        exist.</pd>
                    </plentry>
                  </parml></pd>
              </plentry>
              <plentry>
                <pt id="MAP">MAP</pt>
                <pd>Required. Each <codeph>MAP</codeph> function takes data structured in
                    (<codeph>key</codeph>, <codeph>value</codeph>) pairs, processes each pair, and
                  generates zero or more output (<codeph>key</codeph>, <codeph>value</codeph>)
                  pairs. The Greenplum MapReduce framework then collects all pairs with the same key
                  from all output lists and groups them together. This output is then passed to the
                    <xref href="#topic1/TASKREDUCE" format="dita">REDUCE</xref> task, which is
                  comprised of <xref href="#topic1/TCF" format="dita">TRANSITION | CONSOLIDATE |
                    FINALIZE</xref> functions. There is one predefined <codeph>MAP</codeph> function
                  named <codeph>IDENTITY</codeph> that returns (<codeph>key</codeph>,
                    <codeph>value</codeph>) pairs unchanged. Although (<codeph>key</codeph>,
                    <codeph>value</codeph>) are the default parameters, you can specify other
                  prototypes as needed.</pd>
              </plentry>
              <plentry>
                <pt id="TCF">TRANSITION | CONSOLIDATE | FINALIZE</pt>
                <pd><codeph>TRANSITION</codeph>, <codeph>CONSOLIDATE</codeph> and
                    <codeph>FINALIZE</codeph> are all component pieces of <xref
                    href="#topic1/REDUCE" format="dita">REDUCE</xref>. A <codeph>TRANSITION</codeph>
                  function is required. <codeph>CONSOLIDATE</codeph> and <codeph>FINALIZE</codeph>
                  functions are optional. By default, all take <codeph>state</codeph> as the first
                  of their input <xref href="#topic1/PARAMETERS" format="dita">PARAMETERS</xref>,
                  but other prototypes can be defined as well.</pd>
                <pd>
                  <p>A <codeph>TRANSITION</codeph> function iterates through each value of a given
                    key and accumulates values in a <codeph>state</codeph> variable. When the
                    transition function is called on the first value of a key, the
                      <codeph>state</codeph> is set to the value specified by <xref
                      href="#topic1/INITIALIZE" format="dita">INITIALIZE</xref> of a <xref
                      href="#topic1/REDUCE" format="dita">REDUCE</xref> job (or the default state
                    value for the data type). A transition function takes two arguments as input:
                    the current state of the key reduction and the next value. It then produces a
                    new <codeph>state</codeph>.</p>
                </pd>
                <pd>If a <codeph>CONSOLIDATE</codeph> function is specified,
                    <codeph>TRANSITION</codeph> processing is performed at the segment-level before
                  redistributing the keys across the Greenplum interconnect for final aggregation
                  (two-phase aggregation). Only the resulting <codeph>state</codeph> value for a
                  given key is redistributed, resulting in lower interconnect traffic and greater
                  parallelism. <codeph>CONSOLIDATE</codeph> is handled like a
                    <codeph>TRANSITION</codeph>, except that instead of <codeph>(state + value)
                    =&gt; state</codeph>, it is <codeph>(state + state) =&gt; state</codeph>.</pd>
                <pd>If a <codeph>FINALIZE</codeph> function is specified, it takes the final
                    <codeph>state</codeph> produced by <codeph>CONSOLIDATE</codeph> (if present) or
                    <codeph>TRANSITION</codeph> and does any final processing before emitting the
                  final result. <codeph>TRANSITION</codeph> and <codeph>CONSOLIDATE</codeph>
                  functions cannot return a set of values. If you need a
                    <codeph>REDUCE</codeph> job to return a set, then a <codeph>FINALIZE</codeph> is
                  necessary to transform the final state into a set of output values.<parml>
                    <plentry>
                      <pt id="TCFNAME">NAME</pt>
                      <pd>Required. A name for the function. Names must be unique with regard to
                        the names of other objects in this MapReduce job (such as function, task,
                        input and output names). You can also specify the name of a function
                        built-in to Greenplum Database. If using a built-in function, do not supply
                          <xref href="#topic1/LANGUAGE" format="dita">LANGUAGE</xref> or a <xref
                          href="#topic1/FUNCTION" format="dita">FUNCTION</xref> body.</pd>
                    </plentry>
                    <plentry>
                      <pt id="FUNCTION">FUNCTION</pt>
                      <pd>Optional. Specifies the full body of the function using the specified
                          <xref href="#topic1/LANGUAGE" format="dita">LANGUAGE</xref>. If
                          <codeph>FUNCTION</codeph> is not specified, then a built-in database
                        function corresponding to <xref href="#topic1/TCFNAME" format="dita"
                          >NAME</xref> is used.</pd>
                    </plentry>
                    <plentry>
                      <pt id="LANGUAGE">LANGUAGE</pt>
                      <pd>Required when <xref href="#topic1/FUNCTION" format="dita">FUNCTION</xref>
                        is used. Specifies the implementation language used to interpret the
                        function. This release has language support for <codeph>perl</codeph>,
                          <codeph>python</codeph>, and <codeph>C</codeph>. If calling a built-in
                        database function, <codeph>LANGUAGE</codeph> should not be specified.</pd>
                    </plentry>
                    <plentry>
                      <pt id="LIBRARY">LIBRARY</pt>
                      <pd>Required when <xref href="#topic1/LANGUAGE" format="dita">LANGUAGE</xref>
                        is C (not allowed for other language functions). To use this attribute,
                          <xref href="#topic1/VERSION" format="dita">VERSION</xref> must be 1.0.0.2.
                        The specified library file must be installed prior to running the MapReduce
                        job, and it must exist in the same file system location on all Greenplum
                        hosts (master and segments).</pd>
                    </plentry>
                    <plentry>
                      <pt id="PARAMETERS">PARAMETERS</pt>
                      <pd>Optional. Function input parameters. The default type is
                          <codeph>text</codeph>. <p><codeph>MAP</codeph> default - <codeph>key
                            text</codeph>, <codeph>value
                            text</codeph></p><p><codeph>TRANSITION</codeph> default - <codeph>state
                            text</codeph>, <codeph>value
                            text</codeph></p><p><codeph>CONSOLIDATE</codeph> default -
                            <codeph>state1 text</codeph>, <codeph>state2 text</codeph> (must have
                          exactly two input parameters of the same data
                            type)</p><p><codeph>FINALIZE</codeph> default - <codeph>state
                            text</codeph> (single parameter only)</p></pd>
                    </plentry>
                    <plentry>
                      <pt id="RETURNS">RETURNS</pt>
                      <pd>Optional. The default return type is
                            <codeph>text</codeph>.<p><codeph>MAP</codeph> default - <codeph>key
                            text</codeph>, <codeph>value
                            text</codeph></p><p><codeph>TRANSITION</codeph> default - <codeph>state
                            text</codeph> (single return value
                            only)</p><p><codeph>CONSOLIDATE</codeph> default - <codeph>state
                            text</codeph> (single return value only)</p><p><codeph>FINALIZE</codeph>
                          default - <codeph>value text</codeph></p></pd>
                    </plentry>
                    <plentry>
                      <pt id="OPTIMIZE">OPTIMIZE</pt>
                      <pd>Optional optimization parameters for the
                            function:<p><codeph>STRICT</codeph> - function is not affected by
                            <codeph>NULL</codeph> values </p><p><codeph>IMMUTABLE</codeph> -
                          function will always return the same value for a given input</p></pd>
                    </plentry>
                    <plentry>
                      <pt id="TCFMODE">MODE</pt>
                      <pd>Optional. Specifies the number of rows returned by the function.
                            <p><codeph>MULTI</codeph> - returns 0 or more rows per input record. The
                          return value of the function must be an array of rows to return, or the
                          function must be written as an iterator using <codeph>yield</codeph> in
                          Python or <codeph>return_next</codeph> in Perl. <codeph>MULTI</codeph> is
                          the default mode for <codeph>MAP</codeph> and <codeph>FINALIZE</codeph>
                          functions.</p><p><codeph>SINGLE</codeph> - returns exactly one row per
                          input record. <codeph>SINGLE</codeph> is the only mode supported for
                            <codeph>TRANSITION</codeph> and <codeph>CONSOLIDATE</codeph> functions.
                          When used with <codeph>MAP</codeph> and <codeph>FINALIZE</codeph>
                          functions, <codeph>SINGLE</codeph> mode can provide modest performance
                          improvement.</p></pd>
                    </plentry>
                  </parml></pd>
              </plentry>
              <plentry>
                <pt id="REDUCE">REDUCE</pt>
                <pd>Required. A <codeph>REDUCE</codeph> definition names the <xref
                    href="#topic1/TCF" format="dita">TRANSITION | CONSOLIDATE | FINALIZE</xref>
                  functions that comprise the reduction of (<codeph>key</codeph>,
                    <codeph>value</codeph>) pairs to the final result set. There are also several
                  predefined <codeph>REDUCE</codeph> jobs you can run, which all operate over a
                  column named <codeph>value</codeph>:<p><codeph>IDENTITY</codeph> - returns (key,
                    value) pairs unchanged</p><p><codeph>SUM</codeph> - calculates the sum of
                    numeric data</p><p><codeph>AVG</codeph> - calculates the average of numeric
                    data</p><p><codeph>COUNT</codeph> - calculates the count of input
                      data</p><p><codeph>MIN</codeph> - calculates the minimum value of numeric
                      data</p><p><codeph>MAX</codeph> - calculates the maximum value of numeric
                      data</p><parml>
                    <plentry>
                      <pt id="REDUCENAME">NAME</pt>
                      <pd>Required. The name of this <codeph>REDUCE</codeph> job. Names must be
                        unique with regard to the names of other objects in this MapReduce job
                        (function, task, input and output names). Also, names cannot conflict with
                        existing objects in the database (such as tables, functions or views).</pd>
                    </plentry>
                    <plentry>
                      <pt id="TRANSITION">TRANSITION</pt>
                      <pd>Required. The name of the <codeph>TRANSITION</codeph> function.</pd>
                    </plentry>
                    <plentry>
                      <pt id="CONSOLIDATE">CONSOLIDATE</pt>
                      <pd>Optional. The name of the <codeph>CONSOLIDATE</codeph> function.</pd>
                    </plentry>
                    <plentry>
                      <pt id="FINALIZE">FINALIZE</pt>
                      <pd>Optional. The name of the <codeph>FINALIZE</codeph> function.</pd>
                    </plentry>
                    <plentry>
                      <pt id="INITIALIZE">INITIALIZE</pt>
                      <pd>Optional for <codeph>text</codeph> and <codeph>float</codeph> data types.
                        Required for all other data types. The default value for text is
                          <codeph>''</codeph>. The default value for float is
                          <codeph>0.0</codeph>. Sets the initial <codeph>state</codeph> value of the
                          <codeph>TRANSITION</codeph> function.</pd>
                    </plentry>
                    <plentry>
                      <pt id="REDUCEKEYS">KEYS</pt>
                      <pd>Optional. Defaults to <codeph>[key, *]</codeph>. When using a multi-column
                        reduce, it may be necessary to specify which columns are key columns and
                        which columns are value columns. By default, any input columns that are not
                        passed to the <codeph>TRANSITION</codeph> function are key columns, and a
                        column named <codeph>key</codeph> is always a key column even if it is
                        passed to the <codeph>TRANSITION</codeph> function. The special indicator
                          <codeph>*</codeph> indicates all columns not passed to the
                          <codeph>TRANSITION</codeph> function. If this indicator is not present in
                        the list of keys, then any unmatched columns are discarded.</pd>
                    </plentry>
                  </parml></pd>
              </plentry>
              <plentry>
                <pt id="TASK">TASK</pt>
                <pd>Optional. A <codeph>TASK</codeph> defines a complete end-to-end
                    <codeph>INPUT</codeph>/<codeph>MAP</codeph>/<codeph>REDUCE</codeph> stage within
                  a Greenplum MapReduce job pipeline. It is similar to <xref href="#topic1/EXECUTE"
                    format="dita">EXECUTE</xref> except that it is not run immediately. A task
                  object can be called as <xref href="#topic1/INPUT" format="dita">INPUT</xref> to
                  further processing stages.<parml>
                    <plentry>
                      <pt id="TASKNAME">NAME</pt>
                      <pd>Required. The name of this task. Names must be unique with regard to the
                        names of other objects in this MapReduce job (such as map function, reduce
                        function, input and output names). Also, names cannot conflict with existing
                        objects in the database (such as tables, functions or views).</pd>
                    </plentry>
                    <plentry>
                      <pt id="SOURCE">SOURCE</pt>
                      <pd>The name of an <xref href="#topic1/INPUT" format="dita">INPUT</xref> or
                        another <codeph>TASK</codeph>.</pd>
                    </plentry>
                    <plentry>
                      <pt id="TASKMAP">MAP</pt>
                      <pd>
                        <p>Optional. The name of a <xref href="#topic1/MAP" format="dita">MAP</xref>
                          function. If not specified, defaults to <codeph>IDENTITY</codeph>.</p>
                      </pd>
                    </plentry>
                    <plentry>
                      <pt id="TASKREDUCE">REDUCE</pt>
                      <pd>
                        <p>Optional. The name of a <xref href="#topic1/REDUCE" format="dita"
                            >REDUCE</xref> function. If not specified, defaults to
                            <codeph>IDENTITY</codeph>.</p>
                      </pd>
                    </plentry>
                  </parml></pd>
              </plentry>
            </parml></pd>
        </plentry>
        <plentry>
          <pt id="EXECUTE">EXECUTE</pt>
          <pd>Required. <codeph>EXECUTE</codeph> defines the final
              <codeph>INPUT</codeph>/<codeph>MAP</codeph>/<codeph>REDUCE</codeph> stage within a
            Greenplum MapReduce job pipeline.<parml>
              <plentry>
                <pt id="RUN">RUN</pt>
                <pd>
                  <parml>
                    <plentry>
                      <pt id="EXECUTESOURCE">SOURCE</pt>
                      <pd>Required. The name of an <xref href="#topic1/INPUT" format="dita"
                          >INPUT</xref> or <xref href="#topic1/TASK" format="dita">TASK</xref>.</pd>
                    </plentry>
                    <plentry>
                      <pt id="TARGET">TARGET</pt>
                      <pd>
                        <p>Optional. The name of an <xref href="#topic1/OUTPUT" format="dita"
                            >OUTPUT</xref>. The default output is <codeph>STDOUT</codeph>.</p>
                      </pd>
                    </plentry>
                    <plentry>
                      <pt id="EXECUTEMAP">MAP</pt>
                      <pd>Optional. The name of a <xref href="#topic1/MAP" format="dita">MAP</xref>
                        function. If not specified, defaults to <codeph>IDENTITY</codeph>.</pd>
                    </plentry>
                    <plentry>
                      <pt id="EXECUTEREDUCE">REDUCE</pt>
                      <pd>Optional. The name of a <xref href="#topic1/REDUCE" format="dita"
                          >REDUCE</xref> function. If not specified, defaults to
                          <codeph>IDENTITY</codeph>.</pd>
                    </plentry>
                  </parml>
                </pd>
              </plentry>
            </parml></pd>
        </plentry>
      </parml>
    </section>
    <section id="section11">
      <title>See Also</title>
      <p><xref href="gpmapreduce.xml#topic1" type="topic" format="dita"/></p>
    </section>
  </body>
</topic>
